# NOTE: this file is a notebook export.  Shell commands and IPython magics are
# kept below as comments so the module parses as plain Python.
# pip install scorecardpy
# pip install lightgbm

# Library Import
import os
import gc
import random
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import missingno as msno
import plotly.express as px
import warnings
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression, Lasso
import scorecardpy as sc
from plotly.offline import init_notebook_mode, plot

# Enable inline plotly rendering in the notebook.
init_notebook_mode()

# Ignore warning messages (notebook-wide; note this also hides FutureWarnings).
warnings.filterwarnings("ignore")

# pandas display option setting: show every column when rendering DataFrames.
pd.set_option('display.max_columns', None)

# %matplotlib inline  # IPython magic; valid only inside a notebook
# File list check: show everything available under ./input.
print(os.listdir("./input"))

# Load every raw table used downstream.
_read = lambda name: pd.read_csv('./input/%s.csv' % name)
app_test = _read('application_test')
app_train = _read('application_train')
bureau = _read('bureau')
bureau_bal = _read('bureau_balance')
cc_bal = _read('credit_card_balance')
cash_bal = _read('POS_CASH_balance')
pre_app = _read('previous_application')
ins_pay = _read('installments_payments')

# Sanity check: shapes of all loaded tables.
print(app_test.shape, app_train.shape, bureau.shape, bureau_bal.shape, cc_bal.shape, cash_bal.shape, pre_app.shape, ins_pay.shape)

# Tag each row's origin so train/test can be split apart again after shared
# preprocessing, then stack both into one frame.
app_train['DataCategory'] = 'TRAIN'
app_test['DataCategory'] = 'TEST'
all_app = pd.concat([app_train, app_test], axis=0)
print(all_app.shape)
"""
# Category Type Column Name
"""
cat_cols = [col for col in all_app.select_dtypes(include='object').columns if col not in ['DataCategory']]
print (cat_cols)
print ('### Category Column Unique Values ###')
print ('-'*100)
for col in cat_cols:
print ('%s : %s'%(col, list(all_app[col].unique())))
"""
# 데이터 요약 정보 확인
"""
all_app.describe()
all_app[all_app['DAYS_EMPLOYED'] >0]['DAYS_EMPLOYED'].describe()
all_app['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)
# Missing Value 존재하는 컬럼
missing_cols = app_train.columns[app_train.isnull().any()].tolist()
print ('## Missing Value # of Columns : %s'%len(missing_cols))
print (missing_cols)
app_train[missing_cols].isnull().sum().reset_index().rename(columns={0:'missing_cnt'}).sort_values('missing_cnt', ascending=False)[:1]
"""
# Missing Value Count Check
"""
msno.bar(app_train[missing_cols], figsize=(16, 5), color='#34495e',labels=True, fontsize=8)
"""
# Data Completeness Check
"""
msno.matrix(app_train[missing_cols], figsize=(14, 5), labels=True, fontsize=8)
# ---------------------------------------------------------------------------
# Default-rate profiling of app_train by one categorical column at a time.
# The original export repeated the same groupby/ratio/pie code for seven
# columns (with unindented loop bodies — SyntaxErrors as plain Python); it is
# factored into the two helpers below.
# ---------------------------------------------------------------------------
default_agg = {
    'TARGET': ['sum', 'count']   # defaults (sum of 0/1 target) and row count
}

def _target_agg(df, col_name, digits=None, sort=False):
    """Group ``df`` by ``col_name`` and compute default counts and ratios.

    Returns a frame with columns ``<col_name>_``, ``TARGET_sum``,
    ``TARGET_count``, ``TARGET_ratio`` (% of defaults within the group) and
    ``CRatio`` (% of all rows falling in the group).
    digits: round TARGET_ratio to ``digits`` and CRatio to ``digits + 2``
    decimals (matching the original cells); None keeps full precision.
    sort: order rows by TARGET_ratio ascending.
    """
    agg_df = df.groupby([col_name]).agg({**default_agg}).reset_index()
    # Flatten the (column, stat) MultiIndex: ('TARGET','sum') -> 'TARGET_sum';
    # the group key becomes '<col_name>_' (empty stat).
    agg_df.columns = ['_'.join(col).strip() for col in agg_df.columns.values]
    agg_df['TARGET_ratio'] = agg_df['TARGET_sum'] / agg_df['TARGET_count'] * 100
    agg_df['CRatio'] = agg_df['TARGET_count'] / agg_df['TARGET_count'].sum() * 100
    if digits is not None:
        agg_df['TARGET_ratio'] = np.round(agg_df['TARGET_ratio'], digits)
        agg_df['CRatio'] = np.round(agg_df['CRatio'], digits + 2)
    if sort:
        agg_df = agg_df.sort_values('TARGET_ratio')
    return agg_df

def _component_pie(agg_df, col_name, title, ratio_chars=None):
    """Draw a pie of group share (CRatio) labelled with each default ratio.

    ratio_chars: truncate the ratio string to this many characters (the
    original first cell sliced the unrounded ratio with ``[:3]``).
    """
    def_ratio = list(agg_df['TARGET_ratio'].astype(str).values)
    names = list(agg_df[col_name + '_'].astype(str).values)
    labels = []
    for (item1, item2) in zip(names, def_ratio):
        if ratio_chars is not None:
            item2 = item2[:ratio_chars]
        labels.append(item1 + '(Default Ratio:' + item2 + '%)')
    fig = px.pie(agg_df, values='CRatio', names=labels, title=title)
    fig.show()

# Contract type: unrounded ratios, label truncated to 3 chars (as originally).
cont_type_agg = _target_agg(app_train, 'NAME_CONTRACT_TYPE')
cont_type_agg.head()
_component_pie(cont_type_agg, 'NAME_CONTRACT_TYPE', 'Product Component Ratio', ratio_chars=3)

# Gender.
agg_df = _target_agg(app_train, 'CODE_GENDER', digits=2)
agg_df.head()
_component_pie(agg_df, 'CODE_GENDER', 'Gender Component Ratio')

# Car / realty ownership: tables only, no chart in the original.
agg_df = _target_agg(app_train, 'FLAG_OWN_CAR', digits=2)
agg_df.head()
agg_df = _target_agg(app_train, 'FLAG_OWN_REALTY', digits=2)
agg_df.head()

# Number of children.
agg_df = _target_agg(app_train, 'CNT_CHILDREN', digits=2)
agg_df.head()
_component_pie(agg_df, 'CNT_CHILDREN', '# of Children Component Ratio')

# Accompanying person type (sorted by default ratio).
agg_df = _target_agg(app_train, 'NAME_TYPE_SUITE', digits=2, sort=True)
agg_df.head()
_component_pie(agg_df, 'NAME_TYPE_SUITE', 'NAME_TYPE_SUITE Component Ratio')

# Income type (sorted by default ratio).
col_name = 'NAME_INCOME_TYPE'
agg_df = _target_agg(app_train, col_name, digits=2, sort=True)
agg_df.head()
_component_pie(agg_df, col_name, '%s Component Ratio' % col_name)

# Education type: table only, no chart in the original.
agg_df = _target_agg(app_train, 'NAME_EDUCATION_TYPE', digits=2, sort=True)
agg_df.head()
"""
# Label encoding
"""
def label_encoding(df, cat_cols):
le = LabelEncoder()
#cat_cols = [col for col in df.select_dtypes(include='category').columns]
for col in cat_cols:
df[col] = df[col].fillna('unknown')
df[col+'_le'] = le.fit_transform(df[col])
return df
"""
# Application Data Label Encoding
"""
all_app_label = label_encoding(all_app, cat_cols)
all_app_label.head()
print (all_app_label.shape)
# Category Features drop
all_app_label.drop(cat_cols, axis = 1, inplace=True)
print (all_app_label.shape)
bureau.head()
"""
# 이전 대출 건수 변수 생성
"""
# Number of previous bureau loans per applicant.
previous_loan_counts = bureau.groupby('SK_ID_CURR', as_index=False)['SK_ID_BUREAU'].count().rename(columns = {'SK_ID_BUREAU': 'previous_loan_counts'})
previous_loan_counts.head()
"""
# Bureau Data의 count, mean, max, min, sum, std 변수 생성
"""
# Aggregate every numeric bureau column per applicant.  select_dtypes keeps
# numeric columns only: modern pandas raises on mean/std over object columns
# (older versions silently dropped them, which this reproduces explicitly).
bureau_agg = bureau.drop(columns = ['SK_ID_BUREAU']).select_dtypes('number').groupby('SK_ID_CURR').agg(['count', 'mean', 'max', 'min', 'sum', 'std']).reset_index()
bureau_agg.head()
"""
# Bureau 변수 Column Name 생성 Bureau Column name + count/mean/max/min/sum/std
"""
# Flatten the (variable, stat) MultiIndex into bureau_<var>_<stat>.
# FIX: build names from the actual column tuples instead of columns.levels —
# the ordering of .levels is not guaranteed to match the data order, so the
# original loop could misalign names and values.
bureau_agg.columns = ['SK_ID_CURR'] + ['bureau_%s_%s' % (var, stat) for var, stat in bureau_agg.columns[1:]]
bureau_agg.head()
# Feature Engineering Result Merge (left joins keep every application row).
all_df = all_app_label.merge(previous_loan_counts, on = 'SK_ID_CURR', how = 'left')
all_df = all_df.merge(bureau_agg, on = 'SK_ID_CURR', how = 'left')
all_df.head()
"""
# 전체 Feature List
"""
feats = [col for col in all_df.columns if col not in ['SK_ID_CURR', 'TARGET', 'DataCategory']]
print (len(feats))
"""
# Missing Value를 mean값으로 변환
"""
all_df.fillna(all_df.mean(), inplace=True)
"""
# Data Split & Normalization
"""
X_train = all_df[all_df['DataCategory']=='TRAIN'][feats]
X_test = all_df[all_df['DataCategory']=='TEST'][feats]
y_train = all_df[all_df['DataCategory']=='TRAIN']['TARGET']
y_test = all_df[all_df['DataCategory']=='TEST']['TARGET']
# Normalization
scaler = StandardScaler()
scaler.fit(all_df[feats])
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)
print (len(X_train_norm), len(X_test_norm))
rfe_selector = RFE(estimator=Lasso(alpha=0.01, max_iter=3000), n_features_to_select=100, step=10, verbose=5)
rfe_selector.fit(X_train_norm, y_train)
rfe_support = rfe_selector.get_support()
rfe_feature = X_train.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'RFE Lasso selected features')
print ('-'*120)
print (rfe_feature)
"""
# Weight of evidence를 활용한 Binning
"""
bins = sc.woebin(all_df[all_df['DataCategory']=='TRAIN'], y="TARGET")
%%time
"""
# Information Value Check
"""
info_value = sc.iv(all_df[all_df['DataCategory']=='TRAIN'], y="TARGET", x=rfe_feature)
iv_feats = list(info_value[info_value['info_value']>=0.1]['variable'])
print (len(iv_feats), iv_feats)
"""
# Train/Test Data를 WOE Bin 값으로 변경
"""
train_woe = sc.woebin_ply(X_train[iv_feats], bins)
test_woe = sc.woebin_ply(X_test[iv_feats], bins)
# %%time  # IPython cell magic; valid only inside a notebook
"""
# LogisticRegression 모형 학습
"""
# L1-regularized logistic regression on the WOE-transformed features;
# saga is one of the solvers that supports the l1 penalty.
lr = LogisticRegression(penalty='l1', C=0.9, solver='saga', n_jobs=-1)
lr.fit(train_woe, y_train)
# Predicted probability of class 1 (default).
train_pred = lr.predict_proba(train_woe)[:,1]
test_pred = lr.predict_proba(test_woe)[:,1]
print(roc_auc_score(y_train, train_pred))
"""
# 학습한 LogisticRegression 모형과 WOE binning 결과를 기준으로 Scorecard 생성
"""
# Build the scorecard from the bins and the fitted coefficients.
card = sc.scorecard(bins, lr, train_woe.columns)
"""
# Scorecard Sample
"""
card['bureau_AMT_CREDIT_SUM_LIMIT_sum']
%%time
"""
# Scorecard 기준 Score 계산
"""
train_score = sc.scorecard_ply(X_train[iv_feats], card, print_step=1)
test_score = sc.scorecard_ply(X_test[iv_feats], card, print_step=1)
"""
# 예측 등급의 안정성을 체크하기 위한 PSI(Population Stability Index) 지표 체크
"""
psi_chk = sc.perf_psi(
score = {'train':train_score, 'test':test_score},
label = {'train':y_train, 'test':y_test},
x_tick_break = 30, #등급별 점수 Range
return_distr_dat=True,
show_plot=False
)
"""
# PSI 지표 결과
- PSI < 0.1 : 안정
- 0.1 <= PSI < 0.25 : 다소 불안정
- PSI >= 0.25 : 불안성
"""
psi_chk['psi']
psi_chk
N_FOLDS=5
def objective(params, iteration):
    """Cross-validate one LightGBM hyperparameter set.

    params: hyperparameter dict.  It is mutated: any incoming
    'num_boost_round' is removed, and the best round count found by early
    stopping is written back under that key.
    iteration: search-loop index, echoed back for bookkeeping.
    Returns [best_cv_auc, params, iteration].

    (FIX: the original body was unindented in the export — a SyntaxError.)
    """
    # num_boost_round must not be inside params when it is also passed
    # explicitly to lgb.cv.
    if 'num_boost_round' in params.keys():
        del params['num_boost_round']
    # NOTE(review): relies on the global `dtrain` defined later in the file;
    # this works because objective() is only called after dtrain exists.
    # early_stopping_rounds was removed in LightGBM 4.x — confirm the pinned
    # version, or switch to callbacks=[lgb.early_stopping(50)].
    cv_results = lgb.cv(params, dtrain, num_boost_round = 1000, nfold=N_FOLDS, early_stopping_rounds=50, metrics=['auc'], seed=2020)
    score = cv_results['auc-mean'][-1]
    # Number of rounds early stopping actually kept.
    estimators = len(cv_results['auc-mean'])
    params['num_boost_round'] = estimators
    return [score, params, iteration]
random.seed(2020)
MAX_EVALS = 100
def random_search(param_grid, max_evals=MAX_EVALS):
    """Random search over ``param_grid``, scored by objective().

    param_grid: dict mapping hyperparameter name -> list of candidates.
    Returns a DataFrame with columns [score, params, iteration], sorted by
    score descending; the pre-sort row position is kept in the 'index'
    column added by reset_index.

    (FIX: the original loop body was unindented in the export — a
    SyntaxError.)
    """
    results = pd.DataFrame(columns=['score', 'params', 'iteration'], index=list(range(max_evals)))
    for i in range(max_evals):
        # Draw one random candidate per hyperparameter.
        hyperparams = {k:random.sample(v,1)[0] for k,v in param_grid.items()}
        # goss boosting cannot use bagging, so disable subsampling for it.
        hyperparams['subsample'] = 1.0 if hyperparams['boosting_type']=='goss' else hyperparams['subsample']
        eval_results = objective(hyperparams, i)
        results.loc[i,:] = eval_results
    results.sort_values('score', ascending=False, inplace=True)
    results.reset_index(inplace=True)
    return results
"""
# Lightgbm Hyperparameter Grid
"""
lgb_param_grid = {
'objective ' : ['binary'],
'boosting_type' : ['gbdt'],
'num_leaves' : list(range(20, 150)),
'learning_rate' : list(np.logspace(np.log10(0.005), np.log10(0.3), base=10, num=100)),
'subsample_for_bin' : list(range(2000, 10000, 2000)),
'min_child_samples' : list(range(20, 500, 5)),
'reg_alpha' : list(np.linspace(0, 1)),
'reg_lambda' : list(np.linspace(0, 1)),
'colsample_bytree' : list(np.linspace(0.6, 0.9, 10)),
'subsample' : list(np.linspace(0.5, 0.9, 10)),
'drop_rate': list(np.linspace(0.01, 0.05, 10))
}
# Data Type Change: wrap the training matrix for LightGBM.
dtrain = lgb.Dataset(data = X_train[rfe_feature], label=y_train)
# %%time  # IPython cell magic; valid only inside a notebook
print('Running Hyperparameter Search...')
# 15 random draws (overrides the MAX_EVALS=100 default for speed).
random_results = random_search(lgb_param_grid, 15)
random_results.head()
# Hyperparameter Setting: rows are sorted by CV score, so position 0 is best.
sel_params = random_results['params'].values
sel_param = sel_params[0]
print(sel_param)
# Model Train on the full training data with the best parameter set.
gbdt = lgb.train(sel_param, dtrain)
pred_train = gbdt.predict(X_train[rfe_feature])
pred_test = gbdt.predict(X_test[rfe_feature])
# Feature importances of the trained booster; keep only features the model
# actually split on.
feat_importance = pd.DataFrame({'feature': rfe_feature, 'importance': gbdt.feature_importance()})
feat_importance = feat_importance[feat_importance['importance'] > 0]
feat_importance.shape
feat_importance
# Horizontal bar chart, most important feature on top.
plt.figure(figsize=(8, 10))
ranked = feat_importance.sort_values(by="importance", ascending=False)
sns.barplot(x="importance", y="feature", data=ranked)
plt.title('LightGBM Features')
plt.yticks(size=8)
plt.tight_layout()
print('Train ROC Score : %s'%roc_auc_score(y_train, pred_train))
# Kaggle submission: take the sample file and overwrite TARGET with our
# test-set predictions.
submission = pd.read_csv('./input/sample_submission.csv')
submission = submission.assign(TARGET=pred_test)
submission.to_csv('./input/submission.csv', index=False)